## Warning: package 'BiocGenerics' was built under R version 4.0.5
## Warning: package 'GenomeInfoDb' was built under R version 4.0.5
load("~/Documents/MiGASti/Databases/gene_matrix.RData")
metadata <- read.table("~/Documents/MiGASti/Databases/metadata.txt")
#set rownames to Sample
setwd("~/Documents/MiGASti/Databases")
row.names(metadata) <- metadata$Sample 
exclude <- read.table("samples2remove.txt")
exclude <- exclude$x
genes_counts_filt = genes_counts[, !colnames(genes_counts) %in% exclude] 
#Excludes the samples from filters. 
#dim(genes_tpm_filt)
metadata_filt = metadata[ !(rownames(metadata) %in% exclude), ]
length(metadata_filt)

[1] 38

#remove low count genes
cpm <- cpm(genes_counts_filt) 
# CPM >= 1 in at least 50% of the samples
keep.exp <- rowSums(cpm > 1) >= (0.5 * ncol(genes_counts_filt) )
genes_counts_filt1 <- genes_counts_filt[ keep.exp, ] #18625 genes 
#performing voom normalisation on data
counts_voom <- limma::voom(genes_counts_filt1)
genes_counts_voom <- counts_voom$E
#order metadata and genes counts
#rownames(metadata)
#colnames(genes_counts_voom)
genes_counts_ordered <- genes_counts_voom[,rownames(metadata_filt)]
#head(genes_counts_ordered)
all(rownames(metadata_filt) == colnames (genes_counts_ordered)) #TRUE

[1] TRUE

PCA

#After filtering: 496 samples in total

pca = prcomp(t(genes_counts_ordered), scale. = TRUE, center = TRUE)
autoplot(pca, data= metadata_filt, colour = 'Stimulation', shape = FALSE, label.size = 2)

PCA excluding uncultured samples

#After filtering: 454 samples in total

#remove ununstim samples in metadata
metadata_cultured <- metadata_filt[metadata_filt$Stimulation != "ununstim",]
#check numbers
dim(metadata_cultured)

[1] 454 38

#check numbers per stimulation
table(metadata_filt$Stimulation)
 ATP      DEX     IFNy      IL4      LPS     R848     TNFa   unstim 
   5       10       79        8      128       42       47      135 

ununstim 42

#remove samples in genes counts datafile
genes_counts_cultured <- genes_counts_ordered[,metadata_cultured$Sample]

res.pca = prcomp(t(genes_counts_cultured)) 

g1 <- autoplot(res.pca, data = metadata_cultured, colour = 'Stimulation') 

allResiduals <- removeBatchEffect(x = genes_counts_cultured, 
                                  design = model.matrix(~ Stimulation, data = metadata_cultured), 
                                  covariates = as.matrix(metadata_cultured[, c("picard_pct_mrna_bases", "picard_pct_ribosomal_bases", "picard_pct_pf_reads_aligned", "picard_pct_percent_duplication")]))

res.pca = prcomp(t(allResiduals)) 

g6 <- autoplot(res.pca, data = metadata_cultured, colour = 'Stimulation') 

# dev.off()
ggarrange(g1,g6, labels = c("a", "b"))

Summary of the PCA

#how many PCS with > 95% variance in total?

#summary(pca)
#Proportion of variance per PC
PoV <- pca$sdev^2/sum(pca$sdev^2)
# Calculate cumulative percents for each PC
cumu <- cumsum(PoV)
# Determine which PC exhibits cumulative percent greater than 95% and % variation associated with the PC as less than 20
co1 <- which(cumu > 0.95)[1]
co1 

[1] 205

Heatmap with 20 PCS

#Including uncultured: 496 samples in total

#set metadata to no capitals
names(metadata_filt) = tolower(names(metadata_filt))
indx <- sapply(metadata, is.character)
metadata[indx] <- lapply(metadata[indx], function(x) as.factor(x))

#include covariates
covariates = c( "donor_id", 
                "region",
                "stimulation",
                "number",
                "average_library_size",
                "qubit_conc",
                "diagnosis",
                "main_diagnosis",
                "sex",
                "age",
                "pmd_minutes",
                "ph",
                "cause_of_death_categories",
                "smoking",
                "alcohol_dependence_daily_use",
                "autoimmune_diseases",
                "infection_2weeks",
                "opiates",
                "benzodiazepines",
                "psychopharmaca",
                "year_collected",
                "trimmomatic_dropped_pct", 
                "picard_pct_mrna_bases",
                "picard_pct_ribosomal_bases",
                "picard_pct_intronic_bases",
                "picard_pct_intergenic_bases",
                "picard_pct_pf_reads_aligned",
                "picard_summed_median",
                "picard_summed_mean",
                "picard_pct_percent_duplication",
                "star_total_reads",
                "star_uniquely_mapped", 
                "star_uniquely_mapped_percent",
                "featurecounts_assigned")

#create format matrix 
matrix_rsquared = matrix(NA, nrow = length(covariates), ncol = 20) #Number of factors
matrix_pvalue = matrix(NA, nrow = length(covariates), ncol = 20)

#lineair model function 
for (x in 1:length(covariates)){
  for (y in 1:20){
    matrix_rsquared[x,y] <- summary( lm(pca$x[,y] ~ metadata_filt[,covariates[x]]) )$adj.r.squared
    matrix_pvalue[x,y] <- tidy( lm(pca$x[,y] ~ metadata_filt[,covariates[x]]) )$p.value[2] #To insert pvalues in the heatmap
  }
}

#fill matrix with values 
matrix_rsquared <- as.data.frame(matrix_rsquared)
matrix_pvalue <- as.data.frame(matrix_pvalue)
rownames(matrix_rsquared) <- covariates
rownames(matrix_pvalue) <- covariates 

#create heatmap with rsquared values 
pheatmap(matrix_rsquared, main = "Correlation (Rsquared) between variables and first 20 PCs", legend = TRUE)

Boxplot PC3

#Variance mainly driven by ununstim

PC3 <- pca$x[,3]
Stimulation <- metadata_filt$stimulation
df = data.frame(Stimulation, PC3)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC3)) +
  geom_boxplot()

Heatmap with 20 PCS after removal of uncultured samples (ununstim)

#Excluding uncultured: 454 samples in total

#remove ununstim samples in metadata
metadata_cultured <- metadata_filt[metadata_filt$stimulation != "ununstim",]
#check numbers
dim(metadata_cultured)

[1] 454 38

#check numbers per stimulation
table(metadata_filt$stimulation)
 ATP      DEX     IFNy      IL4      LPS     R848     TNFa   unstim 
   5       10       79        8      128       42       47      135 

ununstim 42

#remove samples in genes counts datafile 
genes_counts_cultured <- genes_counts_ordered[,metadata_cultured$sample]
#table(metadata_filt$stimulation)
#lenght(metadata_cultured)
pca = prcomp(t(genes_counts_cultured), scale. = TRUE, center = TRUE)

#set metadata to no capitals
names(metadata_cultured) = tolower(names(metadata_cultured))
indx <- sapply(metadata, is.character)
metadata[indx] <- lapply(metadata[indx], function(x) as.factor(x))

#include covariates
covariates = c( "donor_id", 
                "region",
                "stimulation",
                "number",
                "average_library_size",
                "qubit_conc",
                "diagnosis",
                "main_diagnosis",
                "sex",
                "age",
                "pmd_minutes",
                "ph",
                "cause_of_death_categories",
                "smoking",
                "alcohol_dependence_daily_use",
                "autoimmune_diseases",
                "infection_2weeks",
                "opiates",
                "benzodiazepines",
                "psychopharmaca",
                "year_collected",
                "trimmomatic_dropped_pct", 
                "picard_pct_mrna_bases",
                "picard_pct_ribosomal_bases",
                "picard_pct_intronic_bases",
                "picard_pct_intergenic_bases",
                "picard_pct_pf_reads_aligned",
                "picard_summed_median",
                "picard_summed_mean",
                "picard_pct_percent_duplication",
                "star_total_reads",
                "star_uniquely_mapped", 
                "star_uniquely_mapped_percent",
                "featurecounts_assigned")

#create format matrix 
matrix_rsquared = matrix(NA, nrow = length(covariates), ncol = 20) #Number of factors
matrix_pvalue = matrix(NA, nrow = length(covariates), ncol = 20)

#lineair model function 
for (x in 1:length(covariates)){
  for (y in 1:20){
    matrix_rsquared[x,y] <- summary( lm(pca$x[,y] ~ metadata_cultured[,covariates[x]]) )$adj.r.squared
    matrix_pvalue[x,y] <- tidy( lm(pca$x[,y] ~ metadata_cultured[,covariates[x]]) )$p.value[2] #To insert pvalues in the heatmap
  }
}

#fill matrix with values 
matrix_rsquared <- as.data.frame(matrix_rsquared)
matrix_pvalue <- as.data.frame(matrix_pvalue)
rownames(matrix_rsquared) <- covariates
rownames(matrix_pvalue) <- covariates 

#create heatmap with rsquared values 
pheatmap(matrix_rsquared, main = "Correlation (Rsquared) between variables and first 20 PCs", legend = TRUE)

Boxplot PC16

PC16 <- pca$x[,16]
Stimulation <- metadata_cultured$stimulation
df = data.frame(Stimulation, PC16)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC16)) +
  geom_boxplot()

Boxplot PC12

PC12 <- pca$x[,12]
Stimulation <- metadata_cultured$stimulation
df = data.frame(Stimulation, PC12)
ggplot(data = df, mapping = aes(x = Stimulation, y = PC12)) +
  geom_boxplot()

PCA of 1 region (excluding uncultured)

GFM

metadata_GFM = metadata_cultured[metadata_cultured$region=="GFM",]
#check numbers
#dim(metadata_GFM)
#check numbers per stimulation
#table(metadata_cultured$region)
#remove samples in genes counts datafile 
genes_counts_GFM <- genes_counts_cultured[,metadata_GFM$sample]
pca = prcomp(t(genes_counts_GFM), scale. = TRUE, center = TRUE)
autoplot(pca, colour = "stimulation", data = metadata_GFM)

SVZ

metadata_SVZ = metadata_cultured[metadata_cultured$region=="SVZ",]
#check numbers
#dim(metadata_SVZ)
#check numbers per stimulation
#table(metadata_cultured$region)
#remove samples in genes counts datafile 
genes_counts_SVZ <- genes_counts_cultured[,metadata_SVZ$sample]
pca = prcomp(t(genes_counts_SVZ), scale. = TRUE, center = TRUE)
autoplot(pca, colour = "stimulation", data = metadata_SVZ)

PCA most variable genes (excluding uncultured)

1000 most variable genes

rv <- rowVars(genes_counts_cultured)
  select <- order(rv, decreasing = TRUE)[1:1000]
  pca <- prcomp(t(genes_counts_cultured[select, ]), scale. = TRUE, center = TRUE)
  autoplot(pca, data= metadata_cultured, colour = 'stimulation', shape = FALSE, label.size = 2)
  autoplot(pca, colour = "stimulation", data = metadata_cultured)

500 most variable genes

rv <- rowVars(genes_counts_cultured)
  select <- order(rv, decreasing = TRUE)[1:500]
  pca <- prcomp(t(genes_counts_cultured[select, ]), scale. = TRUE, center = TRUE)
  autoplot(pca, data= metadata_cultured, colour = 'stimulation', shape = FALSE, label.size = 2)
  autoplot(pca, colour = "stimulation", data = metadata_cultured)